package com.scrapy4j.htmlp.extract;

import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Extracts the value of a labelled attribute (for example an author or a
 * source line) from page text.
 *
 * <p>The text is cut into lines on {@code '\n'} and {@code '|'}. Every line in
 * which the label regex is found contributes the text that follows the label as
 * a candidate value; the candidate seen most often is returned after some
 * trimming.
 */
public class Attr {

    /** ASCII colon and full-width colon (U+FF1A); both are stripped from a labelled line. */
    private static final Pattern COLONS = Pattern.compile("[:\uFF1A]");

    /** Full-width comma (U+FF0C). A value consisting of only a comma is treated as empty. */
    private static final String FULL_WIDTH_COMMA = "\uFF0C";

    /**
     * Closing brackets that end a value, tested in this order:
     * {@code )}, full-width right parenthesis (U+FF09), {@code ]}, and the
     * right black lenticular bracket (U+3011).
     */
    private static final char[] CLOSING_BRACKETS = {')', '\uFF09', ']', '\u3011'};

    /**
     * Extracts the attribute value without limiting its length.
     *
     * @param txt   text to search; {@code null} yields an empty string
     * @param regex regular expression matching the attribute label
     * @return the extracted value, or an empty string when nothing was found
     */
    public static String parse(String txt, String regex) {
        return parse(txt, regex, 0);
    }

    /**
     * Extracts the attribute value that appears most often after the label.
     *
     * <p>The winning candidate is reduced to its first space-separated token,
     * truncated to {@code length} characters when {@code length > 0}, and
     * finally cut just before a closing bracket if it contains one.
     *
     * @param txt    text to search; {@code null} yields an empty string
     * @param regex  regular expression matching the attribute label
     * @param length maximum length of the result; values {@code <= 0} mean no limit
     * @return the extracted value, or an empty string when nothing was found
     * @throws java.util.regex.PatternSyntaxException if {@code regex} is not a valid pattern
     */
    public static String parse(String txt, String regex, int length) {
        if (txt == null) {
            return "";
        }

        Map<String, Integer> counts = countValues(txt, regex);
        if (counts.isEmpty()) {
            // No labelled line carried a value.
            return "";
        }

        String value = mostFrequent(counts);

        // Keep only the first space-separated token; the rest is trailing noise.
        int space = value.indexOf(' ');
        if (space >= 0) {
            value = value.substring(0, space);
        }
        value = value.trim();

        if (length > 0 && value.length() > length) {
            value = value.substring(0, length);
        }

        return cutAtClosingBracket(value);
    }

    /** Counts how often each non-empty candidate value follows the label. */
    private static Map<String, Integer> countValues(String txt, String regex) {
        Pattern pattern = Pattern.compile(regex);
        Map<String, Integer> counts = new HashMap<String, Integer>();

        // '|' is treated as a line separator just like '\n'.
        String[] lines = txt.replace('|', '\n').split("\n");
        for (int i = 0; i < lines.length; i++) {
            Matcher matcher = pattern.matcher(lines[i]);
            if (!matcher.find()) {
                continue;
            }
            String label = matcher.group();

            String line = COLONS.matcher(lines[i]).replaceAll("").trim();
            String[] parts = pattern.split(line);
            String value = parts.length >= 2 ? parts[1].trim() : "";
            if (value.equals(",") || value.equals(FULL_WIDTH_COMMA)) {
                value = "";
            }

            if (!value.isEmpty()) {
                Integer seen = counts.get(value);
                counts.put(value, seen == null ? 1 : seen + 1);
            } else if (i + 1 < lines.length) {
                // The label stands alone on this line, so the value is expected
                // on the next one. Prefix that line with the matched label text
                // (not the regex source, which need not match itself) so the
                // next iteration treats it as a labelled line.
                lines[i + 1] = label + lines[i + 1];
            }
        }
        return counts;
    }

    /**
     * Returns the key with the highest count. Ties are resolved by the map's
     * iteration order: the first key reaching the maximum wins.
     */
    private static String mostFrequent(Map<String, Integer> counts) {
        String best = "";
        int bestCount = 0;
        for (Map.Entry<String, Integer> entry : counts.entrySet()) {
            if (entry.getValue() > bestCount) {
                bestCount = entry.getValue();
                best = entry.getKey();
            }
        }
        return best;
    }

    /**
     * Cuts {@code value} just before a closing bracket. The bracket kinds are
     * tried in the order of {@link #CLOSING_BRACKETS}; the first kind present
     * decides where the value ends.
     */
    private static String cutAtClosingBracket(String value) {
        for (char bracket : CLOSING_BRACKETS) {
            int at = value.indexOf(bracket);
            if (at >= 0) {
                return value.substring(0, at);
            }
        }
        return value;
    }
}
